# Modified by: Mamoor Ali Khan
# Date Modified: Feb 26, 2025
# Purpose: Helper scripts for analysis and tables for full-scale IVR experiment


# Clear data
rm(list = ls())


# Install and load pacman if not already done
if (!requireNamespace("pacman", quietly = TRUE)) {
   install.packages("pacman")
}
# pacman::p_unload(all)
pacman::p_load(tidyverse, rlang, here)




get_pvalue_decorations <- function(pvalues) {
   pvalue_decorations <- c("", "^{\\dagger}", "^{*}", "^{**}", "^{***}")

   pvalue_decorations[
      as.integer(as.character(cut(
         pvalues,
         breaks = c(0, 0.001, 0.01, 0.05, 0.1, 1),
         labels = 5:1,
         include.lowest = TRUE
      )))
   ]
}

# tools to help return results
tidy.data.frame <- function(dat) {
   dat
}
tidy_m <- function(model) {
   n <- model$nobs
   rsq <- model$r.squared

   model %>%
      tidy() %>%
      mutate(N = n, rsq = rsq)
}

# This function checks the pre-specified covariates for whether or not they predict a
# given outcome in the control group. It returns a data.frame with the relevant p-values
pick_covariates <- function(
    dat,
    outcome,
    covariates = c(
       "age_bin",
       "income_scale",
       "educ_bin",
       "pol_knowledge",
       "copartisan",
       "party_support_bin",
       "voted"
    ),
    weights = NULL) {
   if (!is.null(weights)) {
      weights <- enquo(weights)
   }

   map_dfr(covariates, ~ {
      lmo <- lm_robust(as.formula(paste0(outcome, " ~ ", .x)), dat, subset = w1_s1_treat == "H1_control", weights = !!weights)
      pval <- pf(lmo$fstatistic[1L], lmo$fstatistic[2L], lmo$fstatistic[3L], lower.tail = FALSE)
      tibble(covariate = .x, fstat_pvalue = pval)
   })
}

# This function fits different models for different treatments
# `dat` is the full data.frame
# `treatments` is vector of treatment variable names
# `outcome_baseline` is the name of the outcome at baseline
# `outcome_endline` is the name at endline, defaults to `outcome_baseline`_end
# `subset_var` is the name of a variable to interact with the treatment
# `covariates` is a logical for whether or not to add pre-treatment covariates to the model
fit_models <- function(
    dat,
    treatments,
    outcome_endline,
    outcome_baseline = NULL,
    subset_var = NULL,
    covariates = TRUE,
    models = c("DiM", "PAP", "LATE"),
    return_pap_model_only = FALSE,
    weights = NULL,
    instrumented_treatments = NULL # named list of instrumented treatments where name is matched tvar
    ) {
   # if (!is.null(weights)) {
   weights <- enquo(weights)
   # }
   # print(weights)

   # If covariates, then find the covariates that predict the outcome in the control
   # group (p < 0.05) and admit those in the final model
   if (covariates) {
      cov_df <- pick_covariates(dat, outcome_endline)
      covs <- cov_df$covariate[cov_df$fstat_pvalue < 0.05]
   } else {
      covs <- NULL
   }

   # Add baseline measurement of outcome on RHS if it is in the data
   if (is.character(outcome_baseline) && outcome_baseline %in% names(dat)) {
      covs <- c(outcome_baseline, covs)
   }

   out_dat <- map_dfr(
      treatments,
      ~ {
         base_level <- levels(as.factor(dat[[.x]]))[1]

         ctrl_dat <- dat[as.factor(dat[[.x]]) == base_level, ]

         # If a subset variable is passed, interact it with the treatment
         if (!is.null(subset_var)) {
            tvar <- paste0(.x, " * ", subset_var)
         } else {
            tvar <- .x
         }

         dat

         pap_covs <- paste0(paste0(covs, collapse = " + "), " + ps_name")

         # Difference-in-means specification
         if ("DiM" %in% models) {
            dim_m <- lm_robust(
               as.formula(paste0(outcome_endline, " ~ ", tvar)),
               dat,
               weights = !!weights
            )
            dim_o <- tidy_m(dim_m)
            dim_o$model <- "DiM"
         } else {
            dim_o <- NULL
         }

         # Pre-analysis plan specification (covariates that pass, baseline measurement, PS FEs)
         if ("PAP" %in% models) {
            pap_m <- lm_robust(
               as.formula(paste0(outcome_endline, " ~ ", tvar, " + ", pap_covs)),
               dat,
               weights = !!weights
            )
            if (return_pap_model_only) {
               return(pap_m)
            }
            pap_o <- tidy_m(pap_m)
            pap_o$model <- "PAP"
         } else {
            pap_o <- NULL
         }

         if ("LATE" %in% models & tvar %in% names(instrumented_treatments)) {
            iv_o <- map_dfr(
               instrumented_treatments[[tvar]],
               ~ {
                  ivtemp_m <- iv_robust(
                     as.formula(paste0(outcome_endline, " ~ ", .x, " + ", pap_covs, " | ", tvar, " + ", pap_covs)),
                     dat,
                     weights = !!weights
                  )
                  ivtemp_o <- tidy_m(ivtemp_m)
                  ivtemp_o$model <- paste0("LATE-", .x)
                  ivtemp_o
               }
            )
         } else {
            iv_o <- NULL
         }

         if (!is.null(subset_var)) {
            # If subsetting effects, test for effects in both subgroups and the difference
            # between them
            possible_t_names <- dim_o$term[grepl(.x, dim_o$term) & !grepl(subset_var, dim_o$term)]

            linhyp_dfs <- map_dfr(possible_t_names, ~ {
               hypothesis <- paste0(.x, " + ", .x, ":", subset_var, " = 0")

               if ("DiM" %in% models) {
                  dim_linhyp <- car::linearHypothesis(
                     dim_m,
                     hypothesis.matrix = hypothesis,
                     test = "F"
                  )

                  dim_linhyp_o <- data.frame(
                     term = paste0(.x, "+", subset_var),
                     estimate = attr(dim_linhyp, "value")[1, 1],
                     std.error = sqrt(attr(dim_linhyp, "vcov")[1, 1]),
                     p.value = dim_linhyp[["Pr(>F)"]][2],
                     outcome = outcome_endline,
                     model = "DiM"
                  )
               } else {
                  dim_linhyp_o <- NULL
               }

               if ("PAP" %in% models) {
                  pap_linhyp <- car::linearHypothesis(
                     pap_m,
                     hypothesis.matrix = hypothesis,
                     test = "F"
                  )

                  pap_linhyp_o <- data.frame(
                     term = paste0(.x, "+", subset_var),
                     estimate = attr(pap_linhyp, "value")[1, 1],
                     std.error = sqrt(attr(pap_linhyp, "vcov")[1, 1]),
                     p.value = pap_linhyp[["Pr(>F)"]][2],
                     outcome = outcome_endline,
                     model = "PAP"
                  )
               } else {
                  pap_linhyp_o <- NULL
               }

               bind_rows(pap_linhyp_o, dim_linhyp_o)
            })
         } else {
            linhyp_dfs <- NULL
         }

         # Add control mean and sd to output for printing
         out_dat <- bind_rows(dim_o, pap_o, linhyp_dfs, iv_o)
         out_dat %>%
            mutate(
               treatment = .x,
               ctrl_mean = mean(ctrl_dat[[outcome_endline]], na.rm = TRUE),
               ctrl_sd = sd(ctrl_dat[[outcome_endline]], na.rm = TRUE)
            )
      }
   )
   out_dat
}

# prep table for printing tex
prepare_results <- function(
    outcomes,
    results,
    treatment_name,
    labels,
    treatment_labels,
    include_control = TRUE,
    blank_table = FALSE,
    model_use = "PAP",
    treatment_header = "ITT: ",
    variable_header = "Outcome",
    number_cols = TRUE) {
   header <- paste0(paste0(
      c(
         variable_header,
         if (include_control) "\\multicolumn{1}{c}{Control mean}" else NULL,
         paste0("\\multicolumn{1}{P{2cm}}{\\centering ", treatment_header, treatment_labels[[treatment_name]], "}"),
         "N" # ,
         # "R^2"
      ),
      collapse = " & "
   ), "\\\\")

   n_cols <- as.integer(include_control) + length(treatment_labels[[treatment_name]]) + 1

   header2 <- paste0(paste0(
      c("", paste0("\\multicolumn{1}{P{2in}}{(", seq_len(n_cols), ")}")),
      collapse = " & "
   ), " \\\\")

   outcome_rows <- map(outcomes, ~ {
      out_dat <- filter(
         results,
         grepl(treatment_name, term),
         outcome == .x,
         model == model_use
      )

      if (blank_table) {
         out_dat <- out_dat %>%
            mutate_at(
               vars(estimate, std.error, N),
               funs(ifelse(TRUE, "", .))
            ) %>%
            mutate(p.value = 1)
         ctrl_mean <- ""
         ctrl_sd <- ""
      } else {
         ctrl_mean <- out_dat[["ctrl_mean"]][1]
         ctrl_sd <- out_dat[["ctrl_sd"]][1]
      }

      out_label <- labels[.x]

      if (blank_table) {
         format_numbers <- "%s"
      } else {
         format_numbers <- "%3.3f"
      }

      row_1 <- paste0(
         c(
            out_label,
            if (include_control) sprintf(format_numbers, ctrl_mean) else NULL,
            paste0(
               sprintf(format_numbers, out_dat$estimate),
               get_pvalue_decorations(out_dat$p.value)
            ),
            out_dat$N[1] # ,
            # sprintf("%3.3f", out_dat$rsq[1])
         ),
         collapse = " & "
      )

      row_2 <- paste0(
         c(
            "",
            if (include_control) paste0("(", sprintf(format_numbers, ctrl_sd), ")") else NULL,
            paste0("(", sprintf(format_numbers, out_dat$std.error), ")"),
            ""
         ),
         collapse = " & "
      )

      row_4 <- paste0(rep(" ", times = 2 + include_control + nrow(out_dat)), collapse = "&")

      map_chr(
         c(
            row_1,
            row_2,
            row_4
         ),
         paste0, "\\\\"
      )
   })

   if (number_cols) {
      ret <- c(header, header2, "\\midrule", unlist(outcome_rows, FALSE, FALSE))
   } else {
      ret <- c(header, "\\midrule", unlist(outcome_rows, FALSE, FALSE))
   }

   ret
}


# prep cleaner table (2020-03-19)
prepare_simple_results <- function(
    outcomes,
    results,
    labels,
    treatment_titles,
    treatment_group_titles,
    treatment_terms,
    control_title,
    control_group_title,
    blank_table = FALSE,
    model_use = "PAP",
    variable_header = "Outcome",
    number_cols = TRUE) {
   stopifnot(
      length(treatment_titles) == length(treatment_group_titles),
      length(treatment_titles) == length(treatment_terms)
   )

   tgroup_cols <- 2
   header_1 <- paste0(paste0(
      c(
         " ",
         paste0("\\multicolumn{1}{P{1.2in}}{", control_title, "}"),
         paste0("\\multicolumn{", tgroup_cols, "}{P{2in}}{", treatment_titles, "}")
      ),
      collapse = " & "
   ), "\\\\")

   header_2 <- paste0(paste0(
      c(
         " ",
         paste0("\\multicolumn{1}{P{1.2in}}{", control_group_title, "}"),
         paste0("\\multicolumn{", tgroup_cols, "}{P{2in}}{", treatment_group_titles, "}")
      ),
      collapse = " & "
   ), "\\\\")

   header_lines <- paste(
      c(
         "\\cmidrule(lr){2-2}",
         map_chr(
            seq_len(length(treatment_titles)) * 2 + 1,
            ~ {
               paste0(
                  "\\cmidrule(lr){", .x, "-", .x + tgroup_cols - 1, "}"
               )
            }
         )
      ),
      collapse = " "
   )

   n_cols <- length(treatment_titles) * tgroup_cols + 2

   label_row <- paste0(paste0(
      c(
         variable_header,
         paste0("\\multicolumn{1}{c}{$\\mu$}"),
         rep(c("\\multicolumn{1}{c}{$\\tau$}", "\\multicolumn{1}{c}{N}"), times = length(treatment_titles))
      ),
      collapse = " & "
   ), "\\\\")

   outcome_rows <- map(outcomes, ~ {
      current_outcome <- .x
      out_label <- labels[current_outcome]

      out_dats <- map(
         treatment_terms,
         ~ {
            filter(
               results,
               grepl(.x, term),
               outcome == current_outcome,
               model == model_use
            )
         }
      )

      ctrl_dat <- map_dfr(
         out_dats,
         ~ {
            tibble(
               ctrl_mean = .x[["ctrl_mean"]][1],
               ctrl_sd = .x[["ctrl_sd"]][1]
            )
         }
      )

      # Confirm control means are the same for both treatment groups
      ctrl_dat <- ctrl_dat[!duplicated(ctrl_dat), ]
      stopifnot(nrow(ctrl_dat) == 1)

      format_numbers <- "%3.3f"

      row_1 <- paste0(
         c(
            out_label,
            sprintf(format_numbers, ifelse(abs(ctrl_dat$ctrl_mean) < 1e-13, 0.0, ctrl_dat$ctrl_mean)),
            unlist(
               map(
                  out_dats,
                  ~ {
                     c(
                        paste0(
                           sprintf(format_numbers, .x$estimate),
                           get_pvalue_decorations(.x$p.value)
                        ),
                        .x$N[1]
                     )
                  }
               )
            )
         ),
         collapse = " & "
      )

      row_2 <- paste0(
         c(
            "",
            paste0("(", sprintf(format_numbers, ctrl_dat$ctrl_sd), ")"),
            unlist(
               map(
                  out_dats,
                  ~ {
                     c(paste0("(", sprintf(format_numbers, .x$std.error), ")"), "")
                  }
               )
            )
         ),
         collapse = " & "
      )

      row_4 <- paste0(rep(" ", times = n_cols), collapse = "&")

      map_chr(
         c(
            row_1,
            row_2,
            row_4
         ),
         paste0, "\\\\"
      )
   })

   c(header_1, header_2, header_lines, label_row, "\\midrule", unlist(outcome_rows, FALSE, FALSE))
}

prepare_subset_results <- function(outcomes, results, treatment_name, labels, treatment_labels, subset_var, subset_label, model = "PAP") {
   header <- paste0(paste0(
      c(
         "",
         "",
         paste0("\\multicolumn{3}{p{4cm}}{\\centering ", treatment_labels[[treatment_name]], "}"),
         ""
      ),
      collapse = " & "
   ), "\\\\")

   n_treats <- length(treatment_labels[[treatment_name]])

   header_cline <- paste0("\\cmidrule(lr){", seq(3, 3 * n_treats, by = 3), "-", seq(5, 2 + 3 * n_treats, by = 3), "}")

   header2_vec <- c(
      "Outcome",
      "\\multicolumn{1}{c}{Control mean}",
      rep(
         c(
            paste0("\\multicolumn{1}{c}{", subset_label, " = ", 0:1, "}"),
            "\\multicolumn{1}{c}{Diff}"
         ),
         times = n_treats
      ),
      "N" # ,
      # "R^2"
   )
   header2 <- paste0(paste0(
      header2_vec,
      collapse = " & "
   ), " \\\\")

   header3 <- paste0(paste0(
      c("", paste0("\\multicolumn{1}{c}{(", seq_len(length(header2_vec) - 1), ")}")),
      collapse = " & "
   ), " \\\\")


   outcome_rows <- map(outcomes, ~ {
      out_dat <- filter(
         results,
         grepl(treatment_name, term),
         outcome == .x,
         model == model
      )

      # relies on order of treatment names!
      out_dat <- out_dat %>%
         mutate(term = gsub("\\+", "_", term)) %>%
         arrange(term)

      out_label <- labels[.x]
      ctrl_mean <- out_dat[["ctrl_mean"]][1]
      ctrl_sd <- out_dat[["ctrl_sd"]][1]

      row_1 <- paste0(
         c(
            out_label,
            sprintf("%3.3f", ctrl_mean),
            paste0(
               sprintf("%3.3f", out_dat$estimate),
               get_pvalue_decorations(out_dat$p.value)
            ),
            out_dat$N[1]
         ),
         collapse = " & "
      )

      row_2 <- paste0(
         c(
            "",
            paste0("(", sprintf("%3.3f", ctrl_sd), ")"),
            paste0("(", sprintf("%3.3f", out_dat$std.error), ")"),
            ""
         ),
         collapse = " & "
      )

      row_4 <- paste0(rep(" ", times = 3 + nrow(out_dat)), collapse = "&")

      map_chr(
         c(
            row_1,
            row_2,
            row_4
         ),
         paste0, "\\\\"
      )
   })

   c(header, header_cline, header2, "\\midrule", unlist(outcome_rows, FALSE, FALSE))
}

# Unused indices functions for now, but TODO incorporate (based on HH pipeline)


create_indices <- function(data, id_variable, treatment_variable, control_value, ...) {
   treatment_variable <- enquo(treatment_variable)
   id_variable <- enquo(id_variable)
   index_vars <- enquos(...)

   data %>%
      select(!!id_variable, !!treatment_variable, ...) %>%
      gather(outcome, value, -!!id_variable, -!!treatment_variable) %>%
      # Standardize outcome values to be mean 0 and sd 1 in control condition
      group_by(outcome) %>%
      mutate(value = (value - mean(value[!!treatment_variable == control_value], na.rm = TRUE)) /
         sd(value[!!treatment_variable == control_value], na.rm = TRUE)) %>%
      # For each unit, check if at least one index component is non-missing
      group_by(!!id_variable) %>%
      mutate(valid_for_index = any(!is.na(value))) %>%
      # Impute missing values by outcome and treatment condition
      group_by(!!treatment_variable, outcome) %>%
      mutate(
         value = case_when(
            # If at least one non-missing component and missing value,
            # impute with mean of that variable for the same treatment condition
            valid_for_index & is.na(value) ~ mean(value, na.rm = TRUE),
            # Otherwise, leave as realized value
            TRUE ~ value
         )
      ) %>%
      # Compute index by unit
      group_by(!!id_variable, !!treatment_variable) %>%
      # Get the mean of the outcome values by unit
      summarize(index = mean(value)) %>%
      ungroup() %>%
      # Standardize index again to create second version that is mean 0 and sd 1 in control condition
      mutate(index_std = (index - mean(index[!!treatment_variable == control_value], na.rm = TRUE)) /
         sd(index[!!treatment_variable == control_value], na.rm = TRUE))
}
